Energy Consumption Forecasting with ML in France¶

Research Questions

To what extent can machine learning models predict daily electricity consumption across French regions, and how do weather and density features improve predictive accuracy?

Which features are most strongly associated with variations in consumption, and how can interpretable ML techniques (e.g., SHAP) be used to understand and communicate these associations?

In [1]:
# IMPORT LIBRARIES
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os 
In [4]:
#Load the dataset and define the target column, drop NA values from the dataset
cons_filepath = "france_energy_meteo_daily.csv"
energy_source_df = pd.read_csv(cons_filepath)
# Display basic info about the dataset 
energy_source_df.info()

# Setting the working directory
project_dir = os.getcwd()   
figures_path = os.path.join(project_dir, "output")
os.makedirs(figures_path, exist_ok=True)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56862 entries, 0 to 56861
Data columns (total 31 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Date         56862 non-null  object 
 1   region_name  56862 non-null  object 
 2   RR           56862 non-null  float64
 3   ALTI         56862 non-null  float64
 4   PMERM        56862 non-null  float64
 5   INST         56862 non-null  float64
 6   GLOT         56862 non-null  float64
 7   DHUMI40      56862 non-null  float64
 8   NEIGETOTX    56862 non-null  float64
 9   NEIG         56862 non-null  float64
 10  BROU         56862 non-null  float64
 11  ORAG         56853 non-null  float64
 12  GRESIL       47779 non-null  float64
 13  GRELE        47937 non-null  float64
 14  ROSEE        47894 non-null  float64
 15  VERGLAS      55496 non-null  float64
 16  SOLNEIGE     47761 non-null  float64
 17  GELEE        56364 non-null  float64
 18  FUMEE        56862 non-null  float64
 19  BRUME        56862 non-null  float64
 20  ECLAIR       47929 non-null  float64
 21  TX           56862 non-null  float64
 22  TNSOL        56862 non-null  float64
 23  FXY          56862 non-null  float64
 24  DRR          56862 non-null  float64
 25  superf       35064 non-null  float64
 26  population   35064 non-null  float64
 27  density      35064 non-null  float64
 28  Région       51864 non-null  object 
 29  Consumption  51864 non-null  float64
 30  Workday?     56862 non-null  int64  
dtypes: float64(27), int64(1), object(3)
memory usage: 13.4+ MB
In [5]:
# 1. Percentage of null values
null_percentage = np.round(100 - energy_source_df.notnull().mean() * 100)

# 2. Percentage of zeros (only for numeric columns)
zero_percentage = (energy_source_df.select_dtypes(include=[np.number]) == 0).mean() * 100
zero_percentage = np.round(zero_percentage)

# 3. Combine both summaries
summary_df = pd.DataFrame({
    "Percentage of Null Values": null_percentage,
    "Percentage of Zeros": zero_percentage
})

# 4. Display full summary
print(summary_df.to_string())
             Percentage of Null Values  Percentage of Zeros
ALTI                               0.0                  0.0
BROU                               0.0                 36.0
BRUME                              0.0                 29.0
Consumption                        9.0                  0.0
DHUMI40                            0.0                 42.0
DRR                                0.0                 24.0
Date                               0.0                  NaN
ECLAIR                            16.0                 76.0
FUMEE                              0.0                100.0
FXY                                0.0                  0.0
GELEE                              1.0                 91.0
GLOT                               0.0                  0.0
GRELE                             16.0                 73.0
GRESIL                            16.0                 82.0
INST                               0.0                  1.0
NEIG                               0.0                 87.0
NEIGETOTX                          0.0                 65.0
ORAG                               0.0                 74.0
PMERM                              0.0                  0.0
ROSEE                             16.0                 58.0
RR                                 0.0                  2.0
Région                             9.0                  NaN
SOLNEIGE                          16.0                 82.0
TNSOL                              0.0                  0.0
TX                                 0.0                  0.0
VERGLAS                            2.0                 88.0
Workday?                           0.0                  3.0
density                           38.0                  0.0
population                        38.0                  0.0
region_name                        0.0                  NaN
superf                            38.0                  0.0
In [6]:
# Drop rows with NAs
energy_source_df.dropna(inplace=True)
# Ensure the Date column is in a datetime format
energy_source_df["Date"] = pd.to_datetime(energy_source_df["Date"])
#Remove duplicated column
energy_source_df.drop(columns=["Région"], inplace=True)


# Place consumption data in the correct unit (kWh) and calculate the consumption per capita
energy_source_df["Consumption"] = energy_source_df["Consumption"] * 1000
energy_source_df["Consumption_per_capita"] = energy_source_df["Consumption"] / energy_source_df["population"]

# Add the day of the week 
energy_source_df["day_of_week"] = energy_source_df["Date"].dt.dayofweek.astype("category")
In [7]:
# Display basic info about the dataset 
energy_source_df.info()
<class 'pandas.core.frame.DataFrame'>
Index: 28860 entries, 0 to 37985
Data columns (total 32 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   Date                    28860 non-null  datetime64[ns]
 1   region_name             28860 non-null  object        
 2   RR                      28860 non-null  float64       
 3   ALTI                    28860 non-null  float64       
 4   PMERM                   28860 non-null  float64       
 5   INST                    28860 non-null  float64       
 6   GLOT                    28860 non-null  float64       
 7   DHUMI40                 28860 non-null  float64       
 8   NEIGETOTX               28860 non-null  float64       
 9   NEIG                    28860 non-null  float64       
 10  BROU                    28860 non-null  float64       
 11  ORAG                    28860 non-null  float64       
 12  GRESIL                  28860 non-null  float64       
 13  GRELE                   28860 non-null  float64       
 14  ROSEE                   28860 non-null  float64       
 15  VERGLAS                 28860 non-null  float64       
 16  SOLNEIGE                28860 non-null  float64       
 17  GELEE                   28860 non-null  float64       
 18  FUMEE                   28860 non-null  float64       
 19  BRUME                   28860 non-null  float64       
 20  ECLAIR                  28860 non-null  float64       
 21  TX                      28860 non-null  float64       
 22  TNSOL                   28860 non-null  float64       
 23  FXY                     28860 non-null  float64       
 24  DRR                     28860 non-null  float64       
 25  superf                  28860 non-null  float64       
 26  population              28860 non-null  float64       
 27  density                 28860 non-null  float64       
 28  Consumption             28860 non-null  float64       
 29  Workday?                28860 non-null  int64         
 30  Consumption_per_capita  28860 non-null  float64       
 31  day_of_week             28860 non-null  category      
dtypes: category(1), datetime64[ns](1), float64(28), int64(1), object(1)
memory usage: 7.1+ MB
In [8]:
# Display descriptive statistics for the remaining columns
energy_source_df.describe()
Out[8]:
Date RR ALTI PMERM INST GLOT DHUMI40 NEIGETOTX NEIG BROU ... TX TNSOL FXY DRR superf population density Consumption Workday? Consumption_per_capita
count 28860 28860.000000 28860.000000 28860.000000 28860.000000 28860.000000 28860.000000 28860.000000 28860.000000 28860.000000 ... 28860.000000 28860.000000 28860.000000 28860.000000 28860.000000 2.886000e+04 28860.000000 2.886000e+04 28860.000000 28860.000000
mean 2017-10-05 05:30:42.661122816 2.493392 324.260537 1017.240516 336.242005 1281.116142 65.949851 2.677550 0.069322 0.277349 ... 16.743264 6.080479 6.729508 111.696284 44493.318204 5.552967e+06 200.393641 2.189303e+08 0.970132 40.370948
min 2014-01-01 00:00:00 0.000000 64.802817 975.620000 0.000000 34.222222 0.000000 0.000000 0.000000 0.000000 ... -5.798061 -14.300000 1.938889 0.000000 12064.688640 2.576252e+06 58.359735 5.959200e+07 0.000000 18.378166
25% 2015-10-18 00:00:00 0.052136 98.726010 1012.759286 123.000000 556.265203 0.000000 0.000000 0.000000 0.000000 ... 11.112390 1.728529 5.146868 0.428571 30118.557770 3.330478e+06 78.061664 1.343175e+08 1.000000 33.248847
50% 2017-08-04 12:00:00 0.448293 162.059902 1017.542262 304.354167 1169.823529 1.217708 0.000000 0.000000 0.200000 ... 16.426581 6.062500 6.303510 29.649123 32363.434265 5.127840e+06 112.923194 1.968565e+08 1.000000 39.245922
75% 2019-09-11 06:00:00 3.024985 532.332191 1022.420357 525.848750 1936.217568 63.659572 0.679434 0.000000 0.458333 ... 22.175000 10.517269 7.874243 161.925000 70795.544145 6.009976e+06 158.830373 2.828795e+08 1.000000 46.793050
max 2021-12-31 00:00:00 83.744086 916.616505 1047.266667 912.222222 3131.843137 930.432099 61.356436 1.000000 1.000000 ... 41.707692 20.911111 20.612766 1420.142857 85103.128115 1.231728e+07 1020.936335 6.568900e+08 1.000000 75.535308
std NaN 4.519007 262.626162 8.401296 241.866574 811.431997 128.717643 7.472844 0.201075 0.276446 ... 7.338474 5.739378 2.232564 168.771991 22721.036332 2.722115e+06 273.265424 1.041739e+08 0.170227 9.586921

8 rows × 30 columns

In [9]:
# Log lin the outcome to Aumentar estabilidad numérica en modelos como SVR o LinearRegression
energy_source_df['LOG_Consumption_per_capita'] = np.log1p(energy_source_df['Consumption_per_capita'])  # computes ln(1 + ingreso)
In [10]:
# Set style
sns.set(style="whitegrid")

# Create subplots
fig, axs = plt.subplots(1, 2, figsize=(14, 5))

# Original Income
sns.histplot(energy_source_df['Consumption_per_capita'], bins=50, ax=axs[0], kde=True)
axs[0].set_title("Original Consumption_per_capita Distribution")
axs[0].set_xlabel("Consumption_per_capita")

# Log Income
sns.histplot(energy_source_df['LOG_Consumption_per_capita'], bins=50, ax=axs[1], kde=True, color='orange')
axs[1].set_title("Log(1 + LOG_Consumption_per_capita) Distribution")
axs[1].set_xlabel("LOG_Consumption_per_capita")

plt.tight_layout()
plt.show()
No description has been provided for this image

Looking at the distributions:

  • Original Consumption_per_capita is already fairly symmetric, bell-shaped, and doesn’t have extreme skewness.
  • The log-transformed version is slightly tighter, but not substantially different — and definitely not enough to justify the loss in interpretability for tree-based and deep learning models.
In [11]:
# Target variable
target = "Consumption_per_capita"

# Identify datetime column
#date_features = ["Date"]

# Identify categorical features 
#categorical_features = ["region_name"]

# Identify numeric features (excluding target, total consumption, and population, region, date)
numeric_features = [
    "RR", "ALTI", "PMERM", "INST", "GLOT", "DHUMI40", "NEIGETOTX", "BROU", "ORAG",
    "GRESIL", "GRELE", "ROSEE", "VERGLAS", "GELEE", "FUMEE", "BRUME", "ECLAIR",
    "TX", "TNSOL", "FXY", "DRR", "density", "Workday?", "day_of_week"
]

Train-Test Split¶

To evaluate model performance fairly, we divide our dataset into training and testing subsets. This ensures that we train our models on one portion of the data and test their predictive ability on unseen examples.

  • Training Set (90%): Used to train the model
  • Testing Set (20%): Held out for evaluating out-of-sample performance
In [12]:
# Define the features (X) and target (y)
X = energy_source_df[numeric_features]
y = energy_source_df[target]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Print the shapes of the resulting datasets
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)
X_train shape: (23088, 24)
X_test shape: (5772, 24)
y_train shape: (23088,)
y_test shape: (5772,)

Data Exploration¶

In [13]:
# Step 1: Concatenate X_train and y_train 
train_data = pd.concat([X_train, y_train], axis=1)

train_data.describe()
Out[13]:
RR ALTI PMERM INST GLOT DHUMI40 NEIGETOTX BROU ORAG GRESIL ... FUMEE BRUME ECLAIR TX TNSOL FXY DRR density Workday? Consumption_per_capita
count 23088.000000 23088.000000 23088.000000 23088.000000 23088.000000 23088.000000 23088.000000 23088.000000 23088.000000 23088.000000 ... 23088.000000 23088.000000 23088.000000 23088.000000 23088.000000 23088.000000 23088.000000 23088.000000 23088.000000 23088.000000
mean 2.488422 323.613836 1017.240810 337.011488 1284.290223 65.865411 2.712122 0.276783 0.122969 0.008952 ... 0.000491 0.256514 0.037379 16.760534 6.089156 6.727186 111.885833 200.763206 0.970591 40.354093
std 4.488603 262.551887 8.375842 242.236362 812.030615 128.187061 7.555000 0.276167 0.248688 0.061797 ... 0.007341 0.263758 0.140028 7.353359 5.735963 2.226330 168.423831 273.731044 0.168954 9.563325
min 0.000000 64.802817 975.620000 0.000000 34.222222 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 -5.798061 -14.300000 2.028571 0.000000 58.359735 0.000000 18.639506
25% 0.052136 98.053006 1012.730326 124.000000 555.025000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.043478 0.000000 11.100595 1.731264 5.151212 0.426692 78.061664 1.000000 33.260815
50% 0.451570 161.421908 1017.551190 305.461538 1178.472222 1.245679 0.000000 0.200000 0.000000 0.000000 ... 0.000000 0.166667 0.000000 16.452994 6.066667 6.307456 29.788889 112.923194 1.000000 39.206134
75% 3.048856 531.752215 1022.387847 526.905769 1940.062500 64.048377 0.684211 0.457143 0.111111 0.000000 ... 0.000000 0.400000 0.000000 22.207692 10.543750 7.869230 162.517857 158.830373 1.000000 46.746905
max 65.095930 916.616505 1047.266667 912.222222 3129.000000 930.432099 61.356436 1.000000 1.000000 1.000000 ... 0.375000 1.000000 1.000000 41.707692 20.911111 20.612766 1393.111111 1020.936335 1.000000 75.535308

8 rows × 24 columns

In [15]:
# 1) Setup
# Set style
sns.set(style="whitegrid")
plt.rcParams['figure.figsize'] = (15, 5)

# 2) Define continuous variables
numeric_features = [
    "RR", "ALTI", "PMERM", "INST", "GLOT", "DHUMI40", "NEIGETOTX", "BROU", "ORAG",
    "GRESIL", "GRELE", "ROSEE", "VERGLAS", "GELEE", "FUMEE", "BRUME", "ECLAIR",
    "TX", "TNSOL", "FXY", "DRR", "density", "Workday?", "day_of_week"
]

# 3) Define categorical variables as the remaining columns
all_columns = train_data.columns.tolist()
#categorical_vars = list(set(all_columns) - set(continuous_vars))

# 4) Force types 
#train data
train_data[numeric_features] = train_data[numeric_features].astype(float)

#test data
train_data[numeric_features] = train_data[numeric_features].astype(float)
In [16]:
# 3) Plot Continuous Variables (Histograms)
n_cols = 3
n_rows = -(-len(numeric_features) // n_cols)  # Ceiling division

fig, axs = plt.subplots(n_rows, n_cols, figsize=(5 * n_cols, 4 * n_rows))
axs = axs.flatten()

for i, col in enumerate(numeric_features):
    sns.histplot(data=train_data, x=col, ax=axs[i], kde=True, bins=30)
    axs[i].set_title(f"Distribution of {col}")
    axs[i].tick_params(axis='x', rotation=45)

# Remove unused subplots
for j in range(i + 1, len(axs)):
    fig.delaxes(axs[j])

plt.tight_layout()

# Save the figure
histogram_path = os.path.join(figures_path, "numeric_features_histograms.png")
plt.savefig(histogram_path, dpi=300)

# Show the plot
plt.show()
No description has been provided for this image
In [18]:
# Plot pairplot to visualize relationships between features
sns.pairplot(train_data[numeric_features])
plt.suptitle("Pairplot of Features")
plt.show()
No description has been provided for this image
In [19]:
# Calculate the correlation matrix
correlation_matrix = train_data.corr()

# Plot the heatmap
plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Matrix of the Data')


# Save the figure
heatmap_path = os.path.join(figures_path, "correlation_matrix.png")
plt.savefig(heatmap_path, dpi=300, bbox_inches='tight')

# Show the plot
plt.show()
No description has been provided for this image

Preparing the Data¶

After the exploratory phase, including the inspection of distributions and correlation structures, we now prepare the data for predictive modeling.

This step involves:

  • Handling missing values (already addressed)
  • Engineering polynomial and interaction terms
  • Scaling and encoding features appropriately
  • Building preprocessing pipelines for continuous and categorical variables
  • This transformation ensures that the data is clean, structured, and model-ready, allowing us to evaluate the importance of education and other variables in predicting income.
In [20]:
"""
Create feature and target variables. Drop categorical variables and highly correlated features
"""
#X = energy_source_df.drop(columns=[target, "Date", 'region_name', 'Région', "Consumption", "population", 'NEIG', 'SOLNEIGE'])
#y = energy_source_df[target]
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define a K-fold cross-validator instead of TimeSeriesSplit
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# For models that need scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
In [ ]:
# Function to show results

"""
This function plots the predictions versus the true values of the test
data for consumption per capita target. All of these graphs are placed
in the report, so a distinguishing title is given to each during input.
The model is also defined in input, to properly differentiate the results
and compare between models.
"""
def plot_predictions(y_true, y_pred, model_name='Model', figure_number="Figure One"):
    plt.figure(figsize=(10, 6))

    plt.scatter(y_true, y_pred, alpha=0.6)

    max_val = max(y_true.max(), y_pred.max())
    min_val = min(y_true.min(), y_pred.min())
    plt.plot([min_val, max_val], [min_val, max_val], 'r--', linewidth=2)

    plt.title(r"$\bf{" + figure_number + r"}$ — Actual vs Predicted — " + model_name, fontsize=20)
    plt.xlabel('Actual Values')
    plt.ylabel('Predicted Values')
    plt.grid(True)
    plt.tight_layout()
    plt.show()
In [23]:
# (2) Decision Tree

# Perform a GridSearchCV that performs cross validation and tuning at the same time. Necessary to
# establish the best performing model for model comparison.
dt_params = {
    'max_depth': [5, 10, 15, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

dt_grid = GridSearchCV(
    DecisionTreeRegressor(random_state=42),
    dt_params,
    cv=kfold,
    scoring='neg_mean_absolute_error',
    n_jobs=-1
)

# Fit training data and cross validate to find best parameters, Generate from Best Estimator in GridSearch, , Predict
dt_grid.fit(X_train, y_train)
dt_model = dt_grid.best_estimator_
y_pred_dt = dt_model.predict(X_test)


# Evaluation Metrics
mae_dt = mean_absolute_error(y_test, y_pred_dt)
rmse_dt = np.sqrt(mean_squared_error(y_test, y_pred_dt))
r2_dt = r2_score(y_test, y_pred_dt)

print("\nDecision Tree Regression:")
print("Best Parameters:", dt_grid.best_params_)
print("MAE:", mae_dt)
print("RMSE:", rmse_dt)
print("R-squared:", r2_dt)
plot_predictions(y_test, y_pred_dt, model_name='Decision Tree', figure_number="Figure\ Five")

# Print a decision tree to communicate the ease in which a planner could explain the
# prediction from a decision tree. Visual included in the appendix of the report.
plt.figure(figsize=(15, 10))
plot_tree(dt_model,
          feature_names=X.columns,
          filled=True,
          proportion=False,
          precision=2,
          impurity=True,
          max_depth=3,
          fontsize=10,
          node_ids=False,)
plt.title(r"$\bf{Appendix\ One}$ Decision Tree Structure for Energy Consumption Prediction", fontsize=20)
plt.tight_layout(pad=1.0)
plt.show()
<>:35: SyntaxWarning: invalid escape sequence '\ '
<>:35: SyntaxWarning: invalid escape sequence '\ '
/tmp/ipykernel_144016/1575183619.py:35: SyntaxWarning: invalid escape sequence '\ '
  plot_predictions(y_test, y_pred_dt, model_name='Decision Tree', figure_number="Figure\ Five")
Decision Tree Regression:
Best Parameters: {'max_depth': 15, 'min_samples_leaf': 4, 'min_samples_split': 10}
MAE: 2.2628588453014955
RMSE: 3.110526550173673
R-squared: 0.8967528173994302
No description has been provided for this image
No description has been provided for this image
In [24]:
# (3) Random Forests

# Perform a GridSearchCV that performs cross validation and tuning at the same time. Necessary to
# establish the best performing model for model comparison.
rf_params = {
    'n_estimators': [100, 200], #300
    'max_depth': [10, 30], #None, 50
    'min_samples_split': [2, 5], #10
    'min_samples_leaf': [1, 2] #4
    }

rf_grid = GridSearchCV(
    RandomForestRegressor(random_state=42),
    rf_params,
    cv=kfold,
    scoring='neg_mean_absolute_error',
    n_jobs=-1
)


# Fit training data and cross validate to find best parameters, Generate from Best Estimator in GridSearch, , Predict
rf_grid.fit(X_train, y_train)
rf_model = rf_grid.best_estimator_
y_pred_rf = rf_model.predict(X_test)

# Evaluation Metrics
mae_rf = mean_absolute_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf))
r2_rf = r2_score(y_test, y_pred_rf)

print("\nRandom Forest Regression:")
print("Best Parameters:", rf_grid.best_params_)
print("MAE:", mae_rf)
print("RMSE:", rmse_rf)
print("R-squared:", r2_rf)
plot_predictions(y_test, y_pred_rf, model_name='Random Forest', figure_number="Figure\ Four")
<>:36: SyntaxWarning: invalid escape sequence '\ '
<>:36: SyntaxWarning: invalid escape sequence '\ '
/tmp/ipykernel_144016/103260272.py:36: SyntaxWarning: invalid escape sequence '\ '
  plot_predictions(y_test, y_pred_rf, model_name='Random Forest', figure_number="Figure\ Four")
Random Forest Regression:
Best Parameters: {'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
MAE: 1.7192654410996708
RMSE: 2.343384738711332
R-squared: 0.9414000050349551
No description has been provided for this image
In [25]:
# (4) Support Vector Machines - Linear Function

# Perform a GridSearchCV that performs cross validation and tuning at the same time. Necessary to
# establish the best performing model for model comparison.
svr_linear_params = {
    'C': [1.0, 10.0], #, 100.0, 1000.0
    'epsilon': [0.001, 0.01] #, 0.1
}

svr_linear_grid = GridSearchCV(
    SVR(kernel='linear'),
    svr_linear_params,
    cv=kfold,
    scoring='neg_mean_absolute_error',
    n_jobs=-1
)

# Fit and cross validate with scaled training, Generate from Best Estimator in GridSearch, Predict to scaled test
svr_linear_grid.fit(X_train_scaled, y_train)
svr_linear_model = svr_linear_grid.best_estimator_
y_pred_svr_linear = svr_linear_model.predict(X_test_scaled)

# Evaluation Metrics
mae_svr_linear = mean_absolute_error(y_test, y_pred_svr_linear)
rmse_svr_linear = np.sqrt(mean_squared_error(y_test, y_pred_svr_linear))
r2_svr_linear = r2_score(y_test, y_pred_svr_linear)

print("\nSVR (Linear):")
print("Best Parameters:", svr_linear_grid.best_params_)
print("MAE:", mae_svr_linear)
print("RMSE:", rmse_svr_linear)
print("R-squared:", r2_svr_linear)
plot_predictions(y_test, y_pred_svr_linear, model_name='SVR Linear', figure_number="Appendix\ Three")
<>:33: SyntaxWarning: invalid escape sequence '\ '
<>:33: SyntaxWarning: invalid escape sequence '\ '
/tmp/ipykernel_144016/1056758878.py:33: SyntaxWarning: invalid escape sequence '\ '
  plot_predictions(y_test, y_pred_svr_linear, model_name='SVR Linear', figure_number="Appendix\ Three")
SVR (Linear):
Best Parameters: {'C': 1.0, 'epsilon': 0.01}
MAE: 4.1727316326458
RMSE: 5.234531226606386
R-squared: 0.7076076713552839
No description has been provided for this image
In [26]:
# (5) Support Vector Machines - Radial Base Function

# Perform a GridSearchCV that performs cross validation and tuning at the same time. Necessary to
# establish the best performing model for model comparison.
svr_rbf_params = {
    'C': [0.1, 1.0],#, 10.0, 100.0
    'gamma': ['auto', 0.1],#'scale', , 0.01
    'epsilon': [0.1, 0.2]#, 0.3
}

svr_rbf_grid = GridSearchCV(
    SVR(kernel='rbf'),
    svr_rbf_params,
    cv=kfold,
    scoring='neg_mean_absolute_error',
    n_jobs=-1
)

# Fit and cross validate with scaled training, Generate from Best Estimator in GridSearch, Predict to scaled test
svr_rbf_grid.fit(X_train_scaled, y_train)
svr_rbf_model = svr_rbf_grid.best_estimator_
y_pred_svr_rbf = svr_rbf_model.predict(X_test_scaled)

# Evaluation Metrics
mae_svr_rbf = mean_absolute_error(y_test, y_pred_svr_rbf)
rmse_svr_rbf = np.sqrt(mean_squared_error(y_test, y_pred_svr_rbf))
r2_svr_rbf = r2_score(y_test, y_pred_svr_rbf)

print("\nSVR (RBF):")
print("Best Parameters:", svr_rbf_grid.best_params_)
print("MAE:", mae_svr_rbf)
print("RMSE:", rmse_svr_rbf)
print("R-squared:", r2_svr_rbf)
plot_predictions(y_test, y_pred_svr_rbf, model_name='SVR RBF', figure_number="Appendix\ Four")
<>:34: SyntaxWarning: invalid escape sequence '\ '
<>:34: SyntaxWarning: invalid escape sequence '\ '
/tmp/ipykernel_144016/2667293759.py:34: SyntaxWarning: invalid escape sequence '\ '
  plot_predictions(y_test, y_pred_svr_rbf, model_name='SVR RBF', figure_number="Appendix\ Four")
SVR (RBF):
Best Parameters: {'C': 1.0, 'epsilon': 0.2, 'gamma': 'auto'}
MAE: 2.950271659557079
RMSE: 3.7524588757336534
R-squared: 0.8497403866527125
No description has been provided for this image